{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Genesis Demonstrator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from typing import Final\n",
    "from config import data_raw_folder, data_processed_folder\n",
    "from timeeval import Datasets\n",
    "from timeeval.datasets import DatasetAnalyzer, DatasetRecord\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/genesis-demonstrator/data and\n",
      "saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed\n"
     ]
    }
   ],
   "source": [
    "# name of the dataset collection handled by this notebook\n",
    "dataset_collection_name = \"Genesis\"\n",
    "\n",
    "# processed-output root and raw-input folder, both derived from the values imported from `config`\n",
    "target_folder = Path(data_processed_folder)\n",
    "source_folder = Path(data_raw_folder).joinpath(\"genesis-demonstrator/data\")\n",
    "\n",
    "print(f\"Looking for source datasets in {source_folder.resolve()} and\\nsaving processed datasets in {target_folder.resolve()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset transformation and pre-processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created directories /home/projects/akita/data/benchmark-data/data-processed/multivariate/Genesis\n"
     ]
    }
   ],
   "source": [
    "# static properties recorded for every dataset of this collection\n",
    "train_type = \"unsupervised\"\n",
    "train_is_normal = False\n",
    "input_type = \"multivariate\"\n",
    "datetime_index = True\n",
    "dataset_type = \"real\"\n",
    "\n",
    "# create target directory\n",
    "dataset_subfolder = Path(input_type) / dataset_collection_name\n",
    "target_subfolder = target_folder / dataset_subfolder\n",
    "# `mkdir(exist_ok=True)` does not raise for an existing directory, so check up front to report\n",
    "# what actually happened; a FileExistsError (path exists but is not a directory) is left to\n",
    "# propagate instead of being reported as success.\n",
    "if target_subfolder.is_dir():\n",
    "    print(f\"Directories {target_subfolder} already exist\")\n",
    "else:\n",
    "    target_subfolder.mkdir(parents=True, exist_ok=True)\n",
    "    print(f\"Created directories {target_subfolder}\")\n",
    "\n",
    "# dataset manager for the processed-data folder; used below to register the dataset\n",
    "dm = Datasets(target_folder, create_if_missing=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Preparing dataset\n",
      "Analyzing metadata\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[('Genesis', 'genesis-anomalies') (test)] /home/projects/akita/data/benchmark-data/data-processed/multivariate/Genesis/genesis-anomalies.metadata.json already exists, but 'overwrite' was specified! Ignoring existing contents.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/genesis-demonstrator/data/Genesis_AnomalyLabels.csv -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/Genesis/genesis-anomalies.test.csv\n"
     ]
    }
   ],
   "source": [
    "# get target filenames\n",
    "dataset_name = \"genesis-anomalies\"\n",
    "filename = f\"{dataset_name}.test.csv\"\n",
    "\n",
    "source_file = source_folder / \"Genesis_AnomalyLabels.csv\"\n",
    "# `path` is relative to the processed-data root and is what gets stored in the dataset record;\n",
    "# `target_filepath` is the location the CSV is actually written to\n",
    "path = dataset_subfolder / filename\n",
    "target_filepath = target_subfolder / filename\n",
    "target_meta_filepath = target_filepath.parent / f\"{dataset_name}.{Datasets.METADATA_FILENAME_PREFIX}\"\n",
    "\n",
    "# transform file\n",
    "print(\"Preparing dataset\")\n",
    "df = pd.read_csv(source_file)\n",
    "# derive both target columns from the raw ones before dropping them\n",
    "timestamps = pd.to_datetime(df[\"Timestamp\"], unit=\"s\")  # raw values are parsed as epoch seconds\n",
    "is_anomaly = (df[\"Label\"] != 0).astype(np.int_)  # any non-zero label is marked as anomalous\n",
    "df = df.drop(columns=[\"Timestamp\", \"Label\"])\n",
    "# place the columns explicitly (timestamp first, is_anomaly last) so the layout does not\n",
    "# depend on where \"Timestamp\" happens to sit in the source file\n",
    "df.insert(0, \"timestamp\", timestamps)\n",
    "df[\"is_anomaly\"] = is_anomaly\n",
    "df.to_csv(target_filepath, index=False)\n",
    "\n",
    "print(\"Analyzing metadata\")\n",
    "da = DatasetAnalyzer((dataset_collection_name, dataset_name), is_train=False, df=df)\n",
    "da.save_to_json(target_meta_filepath, overwrite=True)\n",
    "meta = da.metadata\n",
    "\n",
    "# save metadata\n",
    "dm.add_dataset(DatasetRecord(\n",
    "    collection_name=dataset_collection_name,\n",
    "    dataset_name=dataset_name,\n",
    "    train_path=None,\n",
    "    test_path=path,\n",
    "    dataset_type=dataset_type,\n",
    "    datetime_index=datetime_index,\n",
    "    split_at=None,\n",
    "    train_type=train_type,\n",
    "    train_is_normal=train_is_normal,\n",
    "    input_type=input_type,\n",
    "    length=meta.length,\n",
    "    dimensions=meta.dimensions,\n",
    "    contamination=meta.contamination,\n",
    "    num_anomalies=meta.num_anomalies,\n",
    "    min_anomaly_length=meta.anomaly_length.min,\n",
    "    median_anomaly_length=meta.anomaly_length.median,\n",
    "    max_anomaly_length=meta.anomaly_length.max,\n",
    "    mean=meta.mean,\n",
    "    stddev=meta.stddev,\n",
    "    trend=meta.trend,\n",
    "    stationarity=meta.get_stationarity_name(),\n",
    "    period_size=np.nan  # no period size is determined for this dataset\n",
    "))\n",
    "print(f\"Processed source dataset {source_file} -> {target_filepath}\")\n",
    "\n",
    "dm.save()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>train_path</th>\n",
       "      <th>test_path</th>\n",
       "      <th>dataset_type</th>\n",
       "      <th>datetime_index</th>\n",
       "      <th>split_at</th>\n",
       "      <th>train_type</th>\n",
       "      <th>train_is_normal</th>\n",
       "      <th>input_type</th>\n",
       "      <th>length</th>\n",
       "      <th>dimensions</th>\n",
       "      <th>contamination</th>\n",
       "      <th>num_anomalies</th>\n",
       "      <th>min_anomaly_length</th>\n",
       "      <th>median_anomaly_length</th>\n",
       "      <th>max_anomaly_length</th>\n",
       "      <th>mean</th>\n",
       "      <th>stddev</th>\n",
       "      <th>trend</th>\n",
       "      <th>stationarity</th>\n",
       "      <th>period_size</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dataset_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>genesis-anomalies</th>\n",
       "      <td>NaN</td>\n",
       "      <td>multivariate/Genesis/genesis-anomalies.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>True</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unsupervised</td>\n",
       "      <td>False</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>16220</td>\n",
       "      <td>18</td>\n",
       "      <td>0.003083</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>22</td>\n",
       "      <td>26</td>\n",
       "      <td>11525.074236</td>\n",
       "      <td>9261.502003</td>\n",
       "      <td>no trend</td>\n",
       "      <td>difference_stationary</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  train_path                                        test_path  \\\n",
       "dataset_name                                                                    \n",
       "genesis-anomalies        NaN  multivariate/Genesis/genesis-anomalies.test.csv   \n",
       "\n",
       "                  dataset_type  datetime_index  split_at    train_type  \\\n",
       "dataset_name                                                             \n",
       "genesis-anomalies         real            True       NaN  unsupervised   \n",
       "\n",
       "                   train_is_normal    input_type  length  dimensions  \\\n",
       "dataset_name                                                           \n",
       "genesis-anomalies            False  multivariate   16220          18   \n",
       "\n",
       "                   contamination  num_anomalies  min_anomaly_length  \\\n",
       "dataset_name                                                          \n",
       "genesis-anomalies       0.003083              3                   2   \n",
       "\n",
       "                   median_anomaly_length  max_anomaly_length          mean  \\\n",
       "dataset_name                                                                 \n",
       "genesis-anomalies                     22                  26  11525.074236   \n",
       "\n",
       "                        stddev     trend           stationarity  period_size  \n",
       "dataset_name                                                                  \n",
       "genesis-anomalies  9261.502003  no trend  difference_stationary          NaN  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# reload the dataset index and show the entries of this collection;\n",
    "# uses the configured collection name instead of repeating the literal\n",
    "# NOTE(review): `_df` is a private attribute of `Datasets` - switch to a public accessor if one is available\n",
    "dm.refresh()\n",
    "dm._df.loc[dataset_collection_name]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "timeeval",
   "language": "python",
   "name": "timeeval"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
